## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
## Script to reproduce all quantitative analysis
## reported in 
## Müller, Stefan (2023). How Slack Facilitates Communication 
## and Collaboration in Seminars and Project-Based Classes.
## Journal of Educational Technology Systems. DOI: https://doi.org/10.1177/00472395231151910
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 


# load packages
library(tidyverse)
library(scales)    # CRAN v1.2.1
library(lubridate) # CRAN v1.8.0
library(ggdist)    # CRAN v3.2.0
library(xtable)    # CRAN v1.8-4



# Run this script from the folder that contains data_slack.csv: create an
# RProj in this folder, use the here() package, or uncomment and edit the
# line below. (The former placeholder `setwd("")` always stopped the script
# with "cannot change working directory", even when an RProj was used.)
# setwd("path/to/replication/folder")

# fail early with a clear message if the data file cannot be found
if (!file.exists("data_slack.csv")) {
    stop("data_slack.csv not found in '", getwd(),
         "'. Set the working directory to the replication folder.",
         call. = FALSE)
}


# custom ggplot2 theme used for all figures
# Starts from theme_minimal() and replaces selected elements: no grid lines,
# a thin black panel border, white panel and plot backgrounds, legend at the
# bottom, and explicit text sizes for title, caption, legend, axes, and
# facet strips. Takes no arguments and returns a ggplot2 theme object.
theme_baser <- function() {
    theme_minimal() %+replace%
        theme(
            # remove all grid lines
            panel.grid.minor.x = element_blank(),
            panel.grid.minor.y = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.major.y = element_blank(),
            # line widths use `linewidth`; the `size` argument of
            # element_rect()/element_line() is deprecated since ggplot2 3.4.0
            panel.border = element_rect(fill = NA, colour = "black",
                                        linewidth = 0.5,
                                        linetype = "solid"),
            legend.title = element_text(size = 15),
            plot.caption = element_text(colour = "grey30", size = 11,
                                        hjust = 1),
            plot.title = element_text(size = 19, face = "bold",
                                      vjust = 1.5, hjust = 0.5,
                                      margin = margin(0, 0, 12, 0)),
            legend.position = "bottom",
            axis.ticks.y = element_line(linewidth = 0.3),
            axis.ticks.x = element_line(linewidth = 0.3),
            axis.ticks.length = unit(0.2, "cm"),
            legend.text = element_text(size = 13),
            # white (not transparent) panel and plot backgrounds
            panel.background = element_rect(fill = 'white'),
            plot.background = element_rect(fill = 'white', color = "white"),
            strip.text = element_text(size = 14, hjust = 0.5,
                                      face = "bold",
                                      margin = margin(b = 5, r = 5,
                                                      l = 5, t = 5)),
            axis.text.y = element_text(colour = "black", size = 13,
                                       hjust = 1),
            axis.text.x = element_text(colour = "black", size = 13),
            axis.title = element_text(size = 13, hjust = 0.5)
        )
}


# use the custom theme for all subsequent plots
theme_set(theme_baser())

# load dataset (UTF-8 encoded csv file)
dat_all_raw <- read.csv("data_slack.csv", fileEncoding = "utf-8")

# copy the raw data and code module delivery: the two modules listed below
# are coded as "Online", all remaining modules as "In Person"
dat_all <- dat_all_raw
dat_all$delivery <- ifelse(
    dat_all$module %in% c("Project-Based Course (2021)",
                          "Introduction to Statistics (2020)"),
    "Online",
    "In Person"
)

# store the date column as class Date
dat_all$date <- as.Date(dat_all$date)

# specify start of reading weeks
# All dates are Mondays, because they are later matched against weeks that
# are floored to the start of the week (week_start = 1).
# "2020-11-26" (a Thursday, which could never match a floored week) was
# corrected to "2020-10-26" so that this vector agrees with the definition
# of date_break used further down in this script.
date_break <- c("2021-03-08", "2021-03-15", "2020-10-26",
                "2021-10-25", "2022-03-07", "2022-03-14")


# create relevant variables of members, messages, and activities
# note on week_start: when unit is weeks, it specifies the reference day
# (7 represents Sunday and 1 represents Monday)
dat_all <- dat_all |>
    mutate(
        # first day of the week each date falls into
        week = floor_date(date, unit = "weeks",
                          week_start = getOption("lubridate.week.start", 1)),
        # messages in public and private channels
        sum_messages_channels = Messages.in.public.channels +
            Messages.in.private.channels,
        # channel messages plus direct messages
        sum_messages_channels_dms = sum_messages_channels + Messages.in.DMs,
        # use Total.Members where Total.enabled.membership is missing
        sum_members = ifelse(is.na(Total.enabled.membership),
                             Total.Members,
                             Total.enabled.membership),
        # members who were active but did not post
        active_only = Daily.active.members - Daily.members.posting.messages,
        # for three days active only has negative values - recode to 0
        active_only = ifelse(active_only < 0, 0, active_only),
        # shares of all members (in percent)
        perc_active = 100 * Daily.active.members / sum_members,
        perc_active_only = 100 * active_only / sum_members,
        perc_posting = 100 * Daily.members.posting.messages / sum_members,
        # relative gap between active and posting members (in percent)
        percentage_difference = (perc_active - perc_posting) /
            perc_posting * 100
    )



# aggregate to one row per module and week: message totals, activity
# proportions, and the running total of channel messages within a module
dat_sum <- dat_all |>
    mutate(week = floor_date(date, unit = "weeks",
                             week_start = getOption("lubridate.week.start",
                                                    1))) |>
    group_by(module) |>
    # proportions are relative to the largest student number of the module
    mutate(
        prop_active_members = Daily.active.members / max(students),
        prop_posting = Daily.members.posting.messages / max(students),
        sum_messages_channels = Messages.in.public.channels +
            Messages.in.private.channels
    ) |>
    group_by(module, week, students) |>
    summarise(
        sd_active = sd(prop_active_members),
        mean_active = mean(prop_active_members),
        sum_message = sum(sum_messages_channels),
        sum_message_dm = sum(sum_messages_channels_dms),
        sd_sum_message = sd(sum_messages_channels),
        mean_posting = mean(prop_posting)
    ) |>
    group_by(module) |>
    mutate(cumulative_sum = cumsum(sum_message))

# start of reading weeks (compared with the floored week variable below)
date_break <- c("2021-03-08", "2021-03-15", "2020-10-26",
                "2021-10-25", "2022-03-07", "2022-03-14")

# inspect how many rows fall into each week
table(dat_sum$week)

# determine reading breaks and number the weeks within each module
dat_sum <- dat_sum %>%
    mutate(reading_break = ifelse(week %in% as.Date(date_break),
                                  "Break", "Term")) %>%
    group_by(module) %>%
    # row_number() instead of 1:n(): 1:n() would yield c(1, 0) for n = 0
    mutate(week_num = row_number())

# reverse the order of the factor levels of reading_break
dat_sum$reading_break <- fct_rev(dat_sum$reading_break)

# retrieve summary statistics for paper:
# average weekly messages (channels only, and including DMs) per module
dat_sum |>
    group_by(module) |>
    summarise(
        mean_week = mean(sum_message),
        mean_with_dms = mean(sum_message_dm)
    )

# the same averages pooled across all modules
dat_sum |>
    ungroup() |>
    summarise(
        mean_week = mean(sum_message),
        mean_with_dms = mean(sum_message_dm)
    )


# clean up labels and create sum of total messages
dat_sum <- dat_sum %>%
    # code module delivery before the year is stripped from the module name
    mutate(delivery = ifelse(module %in% c("Project-Based Course (2021)",
                                           "Introduction to Statistics (2020)"),
                             "Online", "In Person")) %>%
    # remove the year in parentheses and surplus whitespace
    mutate(module = gsub("\\(2020\\)|\\(2021\\)|\\(2022\\)", "", module)) |>
    mutate(module = str_squish(module)) |>
    # sum_message_dm is the weekly sum of sum_messages_channels_dms, i.e. it
    # already contains the channel messages plus the direct messages.
    # Adding sum_message on top (as done previously) counted every channel
    # message twice.
    # NOTE(review): this changes the values shown in Figure 1 - confirm
    # against the published figure.
    mutate(sum_total = sum_message_dm) |>
    mutate(label = paste0(module, " (", delivery, ")"))


# mean and median of the total weekly messages for each module label
dat_sum_des <- summarise(
    group_by(dat_sum, label),
    mean = mean(sum_total),
    median = median(sum_total)
)

# order in which the module labels should appear
levels_modules <- c(
    "Introduction to Statistics (Online)",
    "Introduction to Statistics (In Person)",
    "Project-Based Course (Online)",
    "Project-Based Course (In Person)"
)

# apply this order to both data frames
dat_sum[["label"]] <- factor(dat_sum[["label"]], levels = levels_modules)
dat_sum_des[["label"]] <- factor(dat_sum_des[["label"]],
                                 levels = levels_modules)

# Figure 1: weekly number of messages for each module (one facet per
# module label); point colour follows reading_break, and the mean and
# median per module are printed in the upper left corner of each facet
ggplot(dat_sum,
       aes(x = factor(week_num), y = sum_total, colour = reading_break)) +
    geom_line(group = 1, colour = "black") +
    geom_point(size = 5) +
    geom_text(
        data = dat_sum_des,
        mapping = aes(x = "1", y = 900,
                      label = paste0("Mean=", round(mean, 0),
                                     "\nMedian=", median)),
        inherit.aes = FALSE,
        hjust = 0,
        size = 5,
        colour = "grey50"
    ) +
    facet_wrap(~label, scales = "free_x") +
    scale_colour_manual(values = c("black", "grey70")) +
    scale_y_continuous(limits = c(-5, 1000)) +
    labs(x = "Week of Teaching Term", y = "Number of Messages per Week") +
    theme(legend.position = "none")

# save the figure as pdf and png
ggsave("fig_01.pdf", width = 9, height = 5.5)
ggsave("fig_01.png", width = 9, height = 5.5)


# get sum of messages for each module and type of delivery
sum_messages <- summarise(
    group_by(dat_all, module, delivery),
    sum_messages_all = sum(sum_messages_channels_dms),
    sum_messages_channels = sum(sum_messages_channels),
    sum_messages_private = sum(Messages.in.private.channels),
    sum_messages_public = sum(Messages.in.public.channels)
)

# overall totals: including direct messages, and channels only
sum(dat_all[["sum_messages_channels_dms"]])
sum(dat_all[["sum_messages_channels"]])

# difference between active members and members posting messages,
# plus a label that combines module name and number of students
dat_diff <- dat_all |>
    mutate(
        diff = Daily.active.members - Daily.members.posting.messages,
        module_students = paste0(module, ": ", students, " Students")
    )



# transform to long format and get number of
# students for the three groups
# pivot_longer() replaces the superseded gather(); the resulting columns are
# the same (the row order differs, which nothing below depends on)
dat_diff_long <- dat_diff %>%
    mutate(passive_only = Daily.active.members -
               Daily.members.posting.messages) |>
    select(module_students, module, passive_only, Daily.active.members,
           Daily.members.posting.messages, date) %>%
    pivot_longer(cols = -c(date, module_students, module),
                 names_to = "var",
                 values_to = "students")


# create labels for active, passive users, and active + passive users
dat_diff_long <- dat_diff_long %>%
    mutate(var = case_when(
        str_detect(var, "posting") ~ "Active (Posting) Users",
        str_detect(var, "passive") ~ "Passive Users",
        str_detect(var, "Daily.active.members") ~ "Active + Passive Users"
    ))


# clean module names: code delivery, then split "Name (Year)" into the
# module name and the year
dat_diff_long_clean <- dat_diff_long %>%
    mutate(delivery = ifelse(module %in% c("Project-Based Course (2021)",
                                           "Introduction to Statistics (2020)"),
                             "Online", "In Person")) %>%
    separate(module, into = c("module", "year"), sep = " \\(") %>%
    # splitting at " (" leaves the closing parenthesis in year ("2021)")
    mutate(year = str_remove(year, "\\)$"))


# change factor levels: "Online" becomes the reference level of delivery
dat_diff_long_clean$delivery <- relevel(factor(dat_diff_long_clean$delivery),
                                        ref = "Online")

# fix the order of the user types
dat_diff_long_clean$var <- factor(dat_diff_long_clean$var,
                                  levels = c("Active + Passive Users",
                                             "Passive Users",
                                             "Active (Posting) Users"))


# transform to long format and calculate proportions
# pivot_longer() replaces the superseded gather(); the resulting columns are
# the same (the row order differs, which nothing below depends on)
dat_diff_long_perc <- dat_diff %>%
    select(module_students, module, perc_active,
           perc_active_only, perc_posting, date) %>%
    pivot_longer(cols = -c(date, module_students, module),
                 names_to = "var",
                 values_to = "percent") |>
    mutate(prop = percent / 100)


# get different types of users
# the order of the conditions matters: "perc_active" also matches
# "perc_active_only", so the more specific pattern is tested first
dat_diff_long_perc <- dat_diff_long_perc %>%
    mutate(var = case_when(
        str_detect(var, "perc_posting") ~ "Active Users",
        str_detect(var, "perc_active_only") ~ "Passive Users",
        str_detect(var, "perc_active") ~ "Active + Passive Users"
    ))


# clean module names: code delivery, then split "Name (Year)" into the
# module name and the year
dat_diff_long_perc <- dat_diff_long_perc %>%
    mutate(delivery = ifelse(module %in% c("Project-Based Course (2021)",
                                           "Introduction to Statistics (2020)"),
                             "Online", "In Person")) %>%
    separate(module, into = c("module", "year"), sep = " \\(") %>%
    # splitting at " (" leaves the closing parenthesis in year ("2021)")
    mutate(year = str_remove(year, "\\)$"))


# relevel factors for plot: "Online" becomes the reference level
dat_diff_long_perc$delivery <- relevel(factor(dat_diff_long_perc$delivery),
                                       ref = "Online")


# fix the order of the user types
dat_diff_long_perc$var <- factor(dat_diff_long_perc$var,
                                 levels = c("Active + Passive Users",
                                            "Passive Users",
                                            "Active Users"))

# facet label that combines module name and delivery
dat_diff_long_perc <- dat_diff_long_perc |>
    mutate(label = paste0(module, " (", delivery, ")"))

# average proportion for every combination of user type and course label
dat_diff_means <- summarise(
    group_by(dat_diff_long_perc, var, label),
    mean = mean(prop)
)

# show the averages
dat_diff_means

# From the ggdist documentation
# The .width argument passed to point_interval: 
# a vector of probabilities to use that determine the widths
# of the resulting intervals.
# If multiple probabilities are provided, multiple intervals 
# per group are generated, 
# each with a different probability interval 
# (and value of the corresponding .width and level generated variables).

# order of the facet labels in the plot (Introduction to Statistics first)
levels_modules <- c(
    "Introduction to Statistics (Online)",
    "Introduction to Statistics (In Person)",
    "Project-Based Course (Online)",
    "Project-Based Course (In Person)"
)

# apply this order to the long data and to the averages
dat_diff_long_perc[["label"]] <- factor(dat_diff_long_perc[["label"]],
                                        levels = levels_modules)
dat_diff_means[["label"]] <- factor(dat_diff_means[["label"]],
                                    levels = levels_modules)


# Figure 2: distribution of the daily proportion of each user type, one
# facet per module label; the mean per user type is printed as a percentage
ggplot(dat_diff_long_perc, aes(x = prop, y = var)) +
    stat_halfeye(
        point_interval = "mean_qi",
        fill = "grey80",
        .width = c(0.50, 0.95)
    ) +
    geom_text(
        data = dat_diff_means,
        aes(label = paste0(round(mean, 2) * 100, "%"), x = mean),
        nudge_y = 0.3,
        size = 4
    ) +
    facet_wrap(~label) +
    labs(y = NULL, x = "Percentage of Users") +
    scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
    theme(legend.position = "none")

# save the figure as pdf and png
ggsave("fig_02.pdf", width = 9, height = 5.5)
ggsave("fig_02.png", width = 9, height = 5.5)


# get table with overview of weekly messages per module and delivery
dat_sum_table <- dat_sum %>%
    rename(Module = module, Delivery = delivery) %>%
    group_by(Module, Delivery) %>%
    # sum_message_dm is the weekly sum of channel messages plus direct
    # messages, so the channel messages are subtracted to obtain direct
    # messages only (as the table caption describes the "Direct M." columns).
    # NOTE(review): this changes the values in Table 1 - confirm against the
    # published table.
    summarise(`Channels (Mean)` = mean(sum_message),
              `Channels (Median)` = median(sum_message),
              `Direct M. (Mean)` = mean(sum_message_dm - sum_message),
              `Direct M. (Median)` = median(sum_message_dm - sum_message))

dat_sum_table

# number of students for each module and type of delivery
dat_students <- data.frame(
    Module = rep(c("Project-Based Course", "Introduction to Statistics"),
                 each = 2),
    Delivery = rep(c("Online", "In Person"), times = 2),
    Students = c(13, 12, 23, 44)
)

# clean table: remove the year in parentheses and surplus whitespace
dat_sum_table <- dat_sum_table |>
    mutate(Module = gsub("\\(2020\\)|\\(2021\\)|\\(2022\\)", "", Module)) |>
    mutate(Module = str_squish(Module))

# merge student numbers with data frame
# the join keys are spelled out instead of relying on the implicit
# natural join over all shared column names
dat_sum_table <- left_join(dat_sum_table, dat_students,
                           by = c("Module", "Delivery"))


# Table 1 (.tex): written to tab_01.tex with fixed column widths
dat_sum_table |>
    xtable(
        caption.placement = "top",
        digits = 0,
        align = c("p{0\\textwidth}",
                  "p{0.25\\textwidth}",
                  "p{0.15\\textwidth}",
                  "p{0.1\\textwidth}",
                  "p{0.1\\textwidth}",
                  "p{0.1\\textwidth}",
                  "p{0.09\\textwidth}",
                  "p{0.1\\textwidth}"),
        label = "tab:summary",
        caption = "Summary statistics of weekly Slack messages (Channels: public and private channels; Direct M.: direct messages between students or a student and the instructor)"
    ) |>
    print(include.rownames = FALSE,
          type = "latex",
          caption.placement = "top",
          size = "footnotesize",
          file = "tab_01.tex")


# Table 1 (.html): written to tab_01.html
dat_sum_table |>
    xtable(
        caption.placement = "top",
        digits = 0,
        caption = "Summary statistics of weekly Slack messages (Channels: public and private channels; Direct M.: direct messages between students or a student and the instructor)"
    ) |>
    print(include.rownames = FALSE,
          type = "html",
          caption.placement = "top",
          size = "footnotesize",
          file = "tab_01.html")



# summary stats of activity: average daily percentages per module
dat_activity_sum <- dat_diff |>
    group_by(module, delivery) |>
    summarise(`Active` = paste0(round(mean(perc_posting), 1), "%"),
              `Passive` = paste0(round(mean(perc_active_only), 1), "%"),
              `Active + Passive` = paste0(round(mean(perc_active), 1), "%"))


# get activity statistics for weekdays only
# weekdays are selected by number (1 = Monday, ..., 7 = Sunday with
# week_start = 1); the labels returned by wday(label = TRUE) depend on the
# locale, so comparing them with "Sat"/"Sun" fails outside English locales
dat_activity_sum_weekdays <- dat_diff |>
    filter(wday(date, week_start = 1) <= 5) |>
    group_by(module, delivery) |>
    summarise(`Active (weekdays)` = paste0(round(mean(perc_posting), 1), "%"),
              `Passive (weekdays)` = paste0(round(mean(perc_active_only), 1), "%"),
              `Active + Passive (weekdays)` = paste0(round(mean(perc_active), 1), "%"))

# merge full data and weekdays data (join keys spelled out explicitly)
dat_activity_tab <- left_join(dat_activity_sum,
                              dat_activity_sum_weekdays,
                              by = c("module", "delivery")) |>
    rename(Module = module, Delivery = delivery)


# shorten "Introduction" to "Intro" and remove the years from module names
dat_activity_tab <- dat_activity_tab |>
    mutate(Module = str_replace_all(Module, "Introduction", "Intro")) |>
    mutate(Module = gsub(" \\(2020\\)| \\(2021\\)| \\(2022\\)", "", Module))

# arrange columns: each measure is followed by its weekday counterpart
dat_activity_tab <- select(
    dat_activity_tab,
    Module, Delivery,
    Active, `Active (weekdays)`,
    Passive, `Passive (weekdays)`,
    everything()
)

# Table 2 (.tex file): written to tab_02.tex with fixed column widths
dat_activity_tab |>
    xtable(
        caption.placement = "top",
        digits = 0,
        align = c("p{0\\textwidth}",
                  "p{0.22\\textwidth}",
                  "p{0.12\\textwidth}",
                  "p{0.09\\textwidth}",
                  "p{0.09\\textwidth}",
                  "p{0.09\\textwidth}",
                  "p{0.09\\textwidth}",
                  "p{0.09\\textwidth}",
                  "p{0.09\\textwidth}"),
        label = "tab:activity",
        caption = "Average daily levels of activity"
    ) |>
    print(include.rownames = FALSE,
          type = "latex",
          caption.placement = "top",
          size = "footnotesize",
          file = "tab_02.tex")


# Table 2 (.html file): written to tab_02.html
dat_activity_tab |>
    xtable(
        caption.placement = "top",
        digits = 0,
        caption = "Average daily levels of activity"
    ) |>
    print(include.rownames = FALSE,
          type = "html",
          caption.placement = "top",
          size = "footnotesize",
          file = "tab_02.html")



# get summary stats for full data frame
# note: some of these numbers are reported in the paper
dat_diff |>
    ungroup() |>
    summarise(mean_active = mean(perc_active),
              median_active = median(perc_active))

# repeat for weekdays
# weekdays are selected by number (1 = Monday, ..., 7 = Sunday with
# week_start = 1); the labels returned by wday(label = TRUE) depend on the
# locale, so comparing them with "Sat"/"Sun" fails outside English locales
dat_diff |>
    filter(wday(date, week_start = 1) <= 5) |>
    ungroup() |>
    summarise(mean_active = mean(perc_active),
              mean_post = mean(perc_posting),
              median_active = median(perc_active))

# separately for both modules
# the first six characters of the module name pool the terms of a module
dat_diff |>
    filter(wday(date, week_start = 1) <= 5) |>
    mutate(module = substr(module, 1, 6)) |>
    group_by(module) |>
    summarise(mean_active = mean(perc_active),
              mean_post = mean(perc_posting),
              median_active = median(perc_active))

# mean, maximum, minimum, and standard deviation of the daily difference
# between active and posting members for each module
dat_diff_sum <- summarise(
    group_by(dat_diff, module, students, module_students),
    mean_diff = mean(diff),
    max_diff = max(diff),
    min_diff = min(diff),
    sd_diff = sd(diff)
)

# show the summary
dat_diff_sum

